{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6ed29811-2add-40fb-981e-e1da95e02ae7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "import scipy.sparse as sparse\n",
    "from scipy.stats import bernoulli, poisson\n",
    "import analysis_utils_mine as utils\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import ast\n",
    "from datetime import datetime\n",
    "import torch\n",
    "import pandas as pd\n",
    "from datetime import datetime, timedelta\n",
    "import pickle\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import xlsxwriter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2985f848-6046-40f9-b742-1a5dad4ebd67",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dataframe_from_annotated_xlsx_file_path(path,\n",
    "                                                sheet_name='Sheet1',\n",
    "                                                required_column='Topic Name'):\n",
    "    \"\"\"Load one annotator's worksheet and keep only the rows that were labeled.\n",
    "\n",
    "    Only the requested worksheet is parsed; the other sheets of the workbook\n",
    "    are not read. Rows whose ``required_column`` cell is empty are dropped.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str\n",
    "        Location of the annotated ``.xlsx`` workbook.\n",
    "    sheet_name : str, optional\n",
    "        Name of the single worksheet to read. Defaults to ``'Sheet1'``.\n",
    "    required_column : str, optional\n",
    "        Column that must be filled in for a row to be kept. Defaults to\n",
    "        ``'Topic Name'``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The worksheet rows whose ``required_column`` is not null. The rows\n",
    "        keep the index labels they had in the full sheet.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If ``required_column`` is not a column of the worksheet.\n",
    "    \"\"\"\n",
    "    annotations = pd.read_excel(path,\n",
    "                                sheet_name=sheet_name,\n",
    "                                engine='openpyxl')\n",
    "    return annotations[annotations[required_column].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6ad84cac-1b2b-4582-b962-8ddc9d5047aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50 entries, 1 to 1569\n",
      "Data columns (total 7 columns):\n",
      " #   Column          Non-Null Count  Dtype  \n",
      "---  ------          --------------  -----  \n",
      " 0   Topic           50 non-null     object \n",
      " 1   Unnamed: 1      0 non-null      float64\n",
      " 2   Coherence       50 non-null     float64\n",
      " 3   Polarization    48 non-null     float64\n",
      " 4   Topic Name      50 non-null     object \n",
      " 5   Description     48 non-null     object \n",
      " 6   Notes/Comments  10 non-null     object \n",
      "dtypes: float64(3), object(4)\n",
      "memory usage: 3.1+ KB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "# Annotator 1, speeches: load the annotated sheet and print its column summary.\n",
    "annotator1_speeches = get_dataframe_from_annotated_xlsx_file_path(\n",
    "    path='venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_frazier_speech.xlsx'\n",
    ")\n",
    "annotator1_speeches.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "15f36679-bb1a-45a7-bc82-17405a3afb87",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50 entries, 1 to 1569\n",
      "Data columns (total 7 columns):\n",
      " #   Column          Non-Null Count  Dtype  \n",
      "---  ------          --------------  -----  \n",
      " 0   Topic           50 non-null     object \n",
      " 1   Unnamed: 1      0 non-null      float64\n",
      " 2   Coherence       50 non-null     float64\n",
      " 3   Polarization    50 non-null     float64\n",
      " 4   Topic Name      50 non-null     object \n",
      " 5   Description     50 non-null     object \n",
      " 6   Notes/Comments  50 non-null     object \n",
      "dtypes: float64(3), object(4)\n",
      "memory usage: 3.1+ KB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "# Annotator 1, tweets: load the annotated sheet and print its column summary.\n",
    "annotator1_tweets = get_dataframe_from_annotated_xlsx_file_path(\n",
    "    path='venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_Frazier_tweet.xlsx'\n",
    ")\n",
    "annotator1_tweets.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a7c63daa-9678-43a7-acaf-595525a68ad8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50 entries, 1 to 1569\n",
      "Data columns (total 7 columns):\n",
      " #   Column          Non-Null Count  Dtype  \n",
      "---  ------          --------------  -----  \n",
      " 0   Topic           50 non-null     object \n",
      " 1   Unnamed: 1      0 non-null      float64\n",
      " 2   Coherence       50 non-null     float64\n",
      " 3   Polarization    41 non-null     float64\n",
      " 4   Topic Name      50 non-null     object \n",
      " 5   Description     43 non-null     object \n",
      " 6   Notes/Comments  16 non-null     object \n",
      "dtypes: float64(3), object(4)\n",
      "memory usage: 3.1+ KB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "# Annotator 2, speeches: load the annotated sheet and print its column summary.\n",
    "annotator2_speeches = get_dataframe_from_annotated_xlsx_file_path(\n",
    "    path='venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_speech.xlsx'\n",
    ")\n",
    "annotator2_speeches.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5d872add-baf4-4638-b3da-5df9e7ae5e2f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 50 entries, 1 to 1569\n",
      "Data columns (total 7 columns):\n",
      " #   Column          Non-Null Count  Dtype  \n",
      "---  ------          --------------  -----  \n",
      " 0   Topic           50 non-null     object \n",
      " 1   Unnamed: 1      0 non-null      float64\n",
      " 2   Coherence       50 non-null     float64\n",
      " 3   Polarization    36 non-null     float64\n",
      " 4   Topic Name      50 non-null     object \n",
      " 5   Description     38 non-null     object \n",
      " 6   Notes/Comments  22 non-null     object \n",
      "dtypes: float64(3), object(4)\n",
      "memory usage: 3.1+ KB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed\n",
      "  warn(msg)\n",
      "/workspace/.conda/envs/tbip/lib/python3.6/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Conditional Formatting extension is not supported and will be removed\n",
      "  warn(msg)\n"
     ]
    }
   ],
   "source": [
    "# Annotator 2, tweets: load the annotated sheet and print its column summary.\n",
    "annotator2_tweets = get_dataframe_from_annotated_xlsx_file_path(\n",
    "    path='venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_tweets.xlsx'\n",
    ")\n",
    "annotator2_tweets.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "bd401ecb-605a-49c9-9c4e-46df6a17004d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: is annotator 2's 'Notes/Comments' cell for 'Topic 3' (tweets) empty?\n",
    "# pd.isna is used rather than np.isnan because np.isnan raises TypeError when\n",
    "# the cell holds text instead of NaN.\n",
    "pd.isna(\n",
    "    dict(zip(annotator2_tweets['Topic'],\n",
    "             annotator2_tweets['Notes/Comments']))['Topic 3']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f1bfe2f5-b502-418e-bd18-e355e0ad5574",
   "metadata": {},
   "outputs": [],
   "source": [
    "discard_labels_to_one_discard_label_map = {'DISCARD': 'DISCARD',\n",
    "                                           'Discard': 'DISCARD',\n",
    "                                           'Disgard': 'DISCARD',\n",
    "                                           'discard': 'DISCARD'}\n",
    "                                           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "8dc3f63c-11e4-4cbc-9a3c-e7879e8543b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _collect_rows_for_consensus(df1, df2, discard_labels_to_one_discard_label_map):\n",
    "    \"\"\"Pair the two annotators' labels per topic, skipping topics both discarded.\n",
    "\n",
    "    Returns a list of ``(topic, name1, desc1, notes1, name2, desc2, notes2)``\n",
    "    tuples in the topic order of ``df1``. Any name that is a key of the\n",
    "    discard map is replaced by the value the map holds for it.\n",
    "    \"\"\"\n",
    "    assert len(df1) == len(df2), 'annotators labeled a different number of topics'\n",
    "    topics = list(df1['Topic'])\n",
    "    assert topics == list(df2['Topic']), 'annotators labeled different topics'\n",
    "\n",
    "    def fields_by_topic(df):\n",
    "        # topic -> (Topic Name, Description, Notes/Comments)\n",
    "        return dict(zip(df['Topic'],\n",
    "                        zip(df['Topic Name'], df['Description'], df['Notes/Comments'])))\n",
    "\n",
    "    fields1 = fields_by_topic(df1)\n",
    "    fields2 = fields_by_topic(df2)\n",
    "    canonical = discard_labels_to_one_discard_label_map\n",
    "\n",
    "    rows = []\n",
    "    for topic in topics:\n",
    "        name1, desc1, notes1 = fields1[topic]\n",
    "        name2, desc2, notes2 = fields2[topic]\n",
    "        if name1 in canonical and name2 in canonical:\n",
    "            # Both annotators discarded this topic: nothing to reconcile.\n",
    "            continue\n",
    "        # A single discarded name is kept, but in its canonical spelling.\n",
    "        rows.append((topic,\n",
    "                     canonical.get(name1, name1), desc1, notes1,\n",
    "                     canonical.get(name2, name2), desc2, notes2))\n",
    "    return rows\n",
    "\n",
    "\n",
    "def two_dataframes_to_one_excel_file_for_consensus_labeling(df1, \n",
    "                                                            df2, \n",
    "                                                            discard_labels_to_one_discard_label_map,\n",
    "                                                            outpath):\n",
    "    \"\"\"Write a side-by-side workbook of two annotators' topic labels for consensus labeling.\n",
    "\n",
    "    For every topic that was not discarded by both annotators, one row is\n",
    "    written holding the topic followed by each annotator's name, description\n",
    "    and notes. Two further columns ('Consensus Topic Name' and\n",
    "    'Notes/Comments') get headers only and are left blank to be filled in.\n",
    "    The number of rows written is printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df1, df2 : pandas.DataFrame\n",
    "        The two annotators' tables. Both must list the same topics in the same\n",
    "        order in 'Topic' and provide 'Topic Name', 'Description' and\n",
    "        'Notes/Comments' columns.\n",
    "    discard_labels_to_one_discard_label_map : dict\n",
    "        Maps every accepted spelling of the discard label to one canonical\n",
    "        spelling. A topic is skipped when both names are keys of this map; a\n",
    "        single discarded name is written in its canonical spelling.\n",
    "    outpath : str\n",
    "        Path of the .xlsx file to create.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    AssertionError\n",
    "        If the two tables differ in length or in their 'Topic' column.\n",
    "    \"\"\"\n",
    "    rows = _collect_rows_for_consensus(df1, df2, discard_labels_to_one_discard_label_map)\n",
    "    print('Number of Topics for consensus labeling, after discarding those labeled as DISCARD by both annotators = ' + str(len(rows)))\n",
    "\n",
    "    # (column letter, column width, header title, right border on the header cell?)\n",
    "    layout = (\n",
    "        ('A', 10, 'Topic', False),\n",
    "        ('B', 30, 'Name 1', False),\n",
    "        ('C', 75, 'Description 1', False),\n",
    "        ('D', 75, 'Notes 1', True),\n",
    "        ('E', 30, 'Name 2', False),\n",
    "        ('F', 75, 'Description 2', False),\n",
    "        ('G', 75, 'Notes 2', True),\n",
    "        ('H', 3, '', True),\n",
    "        ('I', 30, 'Consensus Topic Name', False),\n",
    "        ('J', 150, 'Notes/Comments', False),\n",
    "    )\n",
    "\n",
    "    # The context manager guarantees the workbook is closed even if a write raises.\n",
    "    with xlsxwriter.Workbook(outpath) as workbook:\n",
    "        # NOTE(review): presumably formats[0] is the workbook's default cell format - confirm.\n",
    "        workbook.formats[0].set_font_size(12)\n",
    "        worksheet = workbook.add_worksheet()\n",
    "        worksheet.freeze_panes(1, 0)  # keep the header row in view while scrolling\n",
    "\n",
    "        # Header cells share one style; some additionally carry a right border.\n",
    "        header_style = {\n",
    "            'bottom': 1,\n",
    "            'top': 1,\n",
    "            'font_size': 12,\n",
    "            'bold': True,\n",
    "            'text_wrap': True,\n",
    "            'valign': 'top',\n",
    "            'align': 'center',\n",
    "        }\n",
    "        header_format = workbook.add_format(header_style)\n",
    "        header_format_with_rborder = workbook.add_format(dict(header_style, right=1))\n",
    "\n",
    "        for letter, width, _title, _has_rborder in layout:\n",
    "            worksheet.set_column(letter + ':' + letter, width)\n",
    "        # NOTE(review): the second argument of set_row is the row height, so the\n",
    "        # header height grows with the number of topics - confirm this is intended.\n",
    "        worksheet.set_row(0, len(rows) + 1)\n",
    "\n",
    "        for letter, _width, title, has_rborder in layout:\n",
    "            worksheet.write(letter + '1',\n",
    "                            title,\n",
    "                            header_format_with_rborder if has_rborder else header_format)\n",
    "\n",
    "        rborder = workbook.add_format({'right': 1})\n",
    "        twrap = workbook.add_format({'text_wrap': True})\n",
    "        rborder_twrap = workbook.add_format({'right': 1, 'text_wrap': True})\n",
    "\n",
    "        def write_notes(cell, notes):\n",
    "            # Notes cells always carry the right border; only real text is wrapped.\n",
    "            if isinstance(notes, str):\n",
    "                worksheet.write(cell, notes, rborder_twrap)\n",
    "            else:\n",
    "                worksheet.write(cell, '', rborder)\n",
    "\n",
    "        # Data starts on spreadsheet row 2, directly below the header.\n",
    "        for row_number, (topic, name1, desc1, notes1, name2, desc2, notes2) in enumerate(rows, start=2):\n",
    "            row = str(row_number)\n",
    "            worksheet.write('A' + row, topic, twrap)\n",
    "            worksheet.write('B' + row, name1, twrap)\n",
    "            # Non-string descriptions (e.g. NaN for an empty cell) are left unwritten.\n",
    "            if isinstance(desc1, str):\n",
    "                worksheet.write('C' + row, desc1, twrap)\n",
    "            write_notes('D' + row, notes1)\n",
    "            worksheet.write('E' + row, name2, twrap)\n",
    "            if isinstance(desc2, str):\n",
    "                worksheet.write('F' + row, desc2, twrap)\n",
    "            write_notes('G' + row, notes2)\n",
    "            # Narrow spacer column between the annotations and the consensus columns.\n",
    "            worksheet.write('H' + row, '', rborder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "9dd3a797-89e2-4331-a48e-b0d312ef5eb4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Topics for consensus labeling, after discarding those labeled as DISCARD by both annotators = 49\n"
     ]
    }
   ],
   "source": [
    "# Speeches: combine both annotators' labels into the consensus-labeling workbook.\n",
    "two_dataframes_to_one_excel_file_for_consensus_labeling(\n",
    "    df1=annotator1_speeches,\n",
    "    df2=annotator2_speeches,\n",
    "    discard_labels_to_one_discard_label_map=discard_labels_to_one_discard_label_map,\n",
    "    outpath='venue_diff_polsci/consensus_topic_labeling_files/speeches_consensus_labeling.xlsx',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "34ea78b2-6203-4617-aea1-4103b2aaa2b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Topics for consensus labeling, after discarding those labeled as DISCARD by both annotators = 50\n"
     ]
    }
   ],
   "source": [
    "# Tweets: combine both annotators' labels into the consensus-labeling workbook.\n",
    "two_dataframes_to_one_excel_file_for_consensus_labeling(\n",
    "    df1=annotator1_tweets,\n",
    "    df2=annotator2_tweets,\n",
    "    discard_labels_to_one_discard_label_map=discard_labels_to_one_discard_label_map,\n",
    "    outpath='venue_diff_polsci/consensus_topic_labeling_files/tweets_consensus_labeling.xlsx',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tbip] *",
   "language": "python",
   "name": "conda-env-tbip-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
